Contents

In this document we will:

  1. Divide cohort in to high and low risk groups + look at which clinical variables characterize the groups

  2. Do differential gene expression analysis (low risk vs high risk) and plot results. Significance level: p-adj=FDR=0.05

  3. Do a gene set enrichment analysis using fgsea. Significance level BH (Benjamini-Hochberg) adjusted p-value=FDR= 0.05

Making risk based groups

load("data/tumor_samples_with_subtype.Rdata")

#make clinical onehot
clin_vars <- c("Age", "Neuter_status", "Grade", "Lymphatic_invasion", "ER_status", "HER2_score", "Breed_Maltese", "Malignitet", "subtype_MPAM50", "Breed", "Histology")
data_for_table <- tumor_samples_with_subtype %>% 
  mutate(subtype_MPAM50 = as.factor(subtype_MPAM50)) %>% 
  kNN(variable = clin_vars, k=5) %>% 
  drop_na("Survival_days") %>% 
  select(c(Sample_ID, all_of(clin_vars), Status_survival, Survival_days)) %>%  
  # Only keeping variables we will use
  as_tibble()

clin_vars = c("Age", "ER_status", "Lymphatic_invasion")

clinical_tbl <- tumor_samples_with_subtype %>% 
  mutate(subtype_MPAM50 = as.factor(subtype_MPAM50)) %>% 
  kNN(variable = clin_vars, k=5) %>% 
  drop_na("Survival_days") %>% 
  mutate(Age = as.vector(scale(Age))) %>%
  select(c(Sample_ID, all_of(clin_vars), Status_survival, Survival_days)) %>%  
  # Only keeping variables we will use
  as_tibble()

# Making treatment contrast groups
clinical_onehot <- clinical_tbl %>%
  mutate(ER_positive = ifelse(ER_status == "N", 0, 1)) %>%
  mutate(Lymphatic_invasion_present = ifelse(Lymphatic_invasion == "Absent", 0, 1)) %>%
  select(-ER_status, -Lymphatic_invasion)

# Fitting the Clinical model on all dogs to get betas to predict from
Cox.clinical <- coxph(Surv(Survival_days, Status_survival) ~ Age +  ER_positive + Lymphatic_invasion_present, data = clinical_onehot)
Predicted_risk_multivariate_clinical <- tibble(Sample_ID = clinical_onehot$Sample_ID, Predicted_risk= Cox.clinical$linear.predictors) %>% column_to_rownames("Sample_ID")

Distribution of risk scores, range, median and mean

#distribution of risk scores
risk_histogram <- ggplot(data = Predicted_risk_multivariate_clinical, aes(Predicted_risk))+
  geom_histogram(binwidth = 0.2)+
  geom_vline(xintercept = -0.65, color = "red")+
  theme_pubclean()+
  xlab("Predicted risk score")+
  ylab("")+
  theme(axis.title=element_text(size=14))
risk_histogram

ggsave(risk_histogram,file="plots/risk_histogram_2025_01_20_one_model.jpeg", height = 4, width = 6.4)

range(Predicted_risk_multivariate_clinical$Predicted_risk)
## [1] -2.739613  1.963318
median(Predicted_risk_multivariate_clinical$Predicted_risk)
## [1] -0.6041138
#Stratifying cohort into low-risk and high-risk groups 
high_risk_dogs <- Predicted_risk_multivariate_clinical %>% 
  filter(Predicted_risk>=median(Predicted_risk_multivariate_clinical$Predicted_risk)) %>% 
  rownames()
low_risk_dogs <- Predicted_risk_multivariate_clinical %>% 
  filter(Predicted_risk<median(Predicted_risk_multivariate_clinical$Predicted_risk)) %>% 
  rownames()

# adding risk-info to clinical onehot and clinical_tbl
clinical_onehot <- clinical_onehot %>% 
  mutate("Risk_group_high" = ifelse(clinical_onehot$Sample_ID %in% high_risk_dogs, 1, 0))

clinical_tbl <- clinical_tbl %>% 
  mutate("Risk_group" = ifelse(clinical_onehot$Sample_ID %in% high_risk_dogs, "high", "low"))
data_for_table<- data_for_table %>% 
  mutate("Risk_group" = ifelse(clinical_onehot$Sample_ID %in% high_risk_dogs, "high", "low"))

Group differences in survival

# Univariate cox to get Hazard ratio
coxph(Surv(Survival_days, Status_survival) ~ Risk_group_high, data = clinical_onehot)
## Call:
## coxph(formula = Surv(Survival_days, Status_survival) ~ Risk_group_high, 
##     data = clinical_onehot)
## 
##                   coef exp(coef) se(coef)     z       p
## Risk_group_high 0.9845    2.6764   0.3511 2.804 0.00505
## 
## Likelihood ratio test=8.54  on 1 df, p=0.003472
## n= 146, number of events= 38
# KM plot of the risk groups
fitrisk <- survfit(Surv(clinical_tbl$Survival_days, clinical_tbl$Status_survival)~Risk_group, data=clinical_tbl)
riskKM.plot <- ggsurvplot(fitrisk, data=clinical_tbl, 
                        pval = T, 
                        conf.int=F, 
                        legend="none",
                        palette= c("deeppink3", "chartreuse3"),
                        font.title = 25, 
                        font.x = 18, 
                        font.y = 18, 
                        xlab = "Days")

riskKM.plot$plot <- riskKM.plot$plot +
                     ggplot2::annotate("text", 
                                x = 120, y = 0.1, # x and y coordinates of the text
                                label = "HR = 2.68", size = 5)

ggsave(file = "plots/riskKM.plot.one_model.png", plot= print(riskKM.plot$plot), height = 4.1, width = 5.7)

Clinical variables’ relation to risk groups

tablerisk <- table_descriptive <- tableby(Risk_group ~ Age + Breed_Maltese + Neuter_status + subtype_MPAM50 + Lymphatic_invasion + ER_status +  HER2_score +Grade, data = data_for_table) 
# labels(table_descriptive) <- c(Neuter_status = "Neuter status", Breed_Maltese = "Breed group", Lymphatic_invasion = "Lymphatic invasion", ER_status = "ER status", HER2_score = "HER2 score", subtype_MPAM50 = "MPAM50 Subtype") # Does not work
summary(tablerisk)
high (N=74) low (N=72) Total (N=146) p value
Age < 0.001
   Mean (SD) 13.851 (1.576) 9.847 (2.324) 11.877 (2.816)
   Range 10.000 - 19.000 2.000 - 12.000 2.000 - 19.000
Breed_Maltese 0.229
   Maltese 18 (24.3%) 24 (33.3%) 42 (28.8%)
   Non-Maltese 56 (75.7%) 48 (66.7%) 104 (71.2%)
Neuter_status 0.006
   Intact 43 (58.1%) 57 (79.2%) 100 (68.5%)
   Neutered 31 (41.9%) 15 (20.8%) 46 (31.5%)
subtype_MPAM50 0.002
   Basal 40 (54.1%) 21 (29.2%) 61 (41.8%)
   LumA 34 (45.9%) 51 (70.8%) 85 (58.2%)
Lymphatic_invasion < 0.001
   Absent 59 (79.7%) 72 (100.0%) 131 (89.7%)
   Present 15 (20.3%) 0 (0.0%) 15 (10.3%)
ER_status < 0.001
   N 21 (28.4%) 2 (2.8%) 23 (15.8%)
   P 53 (71.6%) 70 (97.2%) 123 (84.2%)
HER2_score 0.094
   0 8 (10.8%) 3 (4.2%) 11 (7.5%)
   1 16 (21.6%) 10 (13.9%) 26 (17.8%)
   2 30 (40.5%) 43 (59.7%) 73 (50.0%)
   3 20 (27.0%) 16 (22.2%) 36 (24.7%)
Grade < 0.001
   Benign 7 (9.5%) 26 (36.1%) 33 (22.6%)
   1 31 (41.9%) 34 (47.2%) 65 (44.5%)
   2 12 (16.2%) 10 (13.9%) 22 (15.1%)
   3 24 (32.4%) 2 (2.8%) 26 (17.8%)

Differential gene expression

DESeq2 analysis - comparing low-risk and high-risk groups

We will follow the workflow described in the DESeq2 vignette. http://bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#why-after-vst-are-there-still-batches-in-the-pca-plot . Low-risk is set to reference level.

Below is a description of what the DESeq() function does

load("data/counts_tumors.Rdata")
counts_tumors <- counts_tumors %>% 
  select(clinical_tbl$Sample_ID)
# Formatting gene names to syntactically valid names
rownames(counts_tumors) <- make.names(rownames(counts_tumors))

# Normalizing count data with DESeq2. 
# Used clinical_tbl(which has the original variables) and not the clinical_onehot. 
dds_tumor <- DESeqDataSetFromMatrix(countData = counts_tumors, 
                                    colData = clinical_tbl, 
                                    design = ~ Risk_group) 


dds_tumor$Risk_group <- relevel(dds_tumor$Risk_group, ref = "low") #Setting low-riak as the reference level

### Pre-filtering: not necessary but recommended since it will decrease run time and memory size dds_tumor ------
keep <- rowSums(counts(dds_tumor)) >= 10 
dds_tumor <- dds_tumor[keep,]

dds_tumor <- DESeq(dds_tumor)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 2013 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
res <- results(dds_tumor) 

DGE list for all genes (significant and non-significant)

datatable(as.data.frame(res), options = list(pageLength = 12, scrollX = T), caption = htmltools::tags$caption(style = 'caption-side: top; text-align: left;', 'Differential gene expression list'))
res_save <- res %>% as.data.frame() %>% rownames_to_column(var = "Gene_name")
openxlsx::write.xlsx(res_save, 'plots/results_DGE_one_model.xlsx')

Summary of results

With FDR= 0.05, there are 2073 differentially expressed genes

res05 <- results(dds_tumor, alpha=0.05) 
summary(res05)
## 
## out of 16995 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 1493, 8.8%
## LFC < 0 (down)     : 580, 3.4%
## outliers [1]       : 0, 0%
## low counts [2]     : 337, 2%
## (mean count < 0)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
sum(res05$padj < 0.05, na.rm=TRUE) #number of genes with padj below 0.05
## [1] 2073

Plots of result

diff_expr_results <- res %>% as.data.frame() %>% rownames_to_column(.,var = "GeneName")
sign <- res$padj < 0.05 & diff_expr_results$log2FoldChange < -1 | diff_expr_results$padj < 0.05 & diff_expr_results$log2FoldChange > 1
not_sign <- !sign
sign_low_risk <- diff_expr_results$padj < 0.05 & diff_expr_results$log2FoldChange < -1
sign_high_risk <- diff_expr_results$padj < 0.05 & diff_expr_results$log2FoldChange > 1

volcano_tumor <- ggplot(diff_expr_results, aes(log2FoldChange, -log10(padj)))+
  geom_point(data = diff_expr_results[not_sign,],alpha = 0.5, size = 1)+
  geom_point(data = diff_expr_results[sign_high_risk,], fill = "deeppink3",color = "black", size = 3, alpha = 0.7, shape = 21)+
  geom_point(data = diff_expr_results[sign_low_risk,], fill = "chartreuse3",color = "black", size = 3, alpha = 0.7, shape = 21)+
  geom_text_repel(label = diff_expr_results$GeneName[sign], data = diff_expr_results[sign,], max.overlaps = 15)+
  geom_hline(yintercept = -log10(0.05), color = "lightgrey", linetype = 2)+
  geom_vline(xintercept = c(-1,1), color = "lightgrey",  linetype = 2)+
  labs(x = "Log 2 fold change", y = "-log10(FDR)")+
  #lims(x = c(-2.5,2.5))+
  #ggtitle("DGE low-risk vs high-risk")+
  theme_classic2(base_size = 16)
volcano_tumor

ggsave(file = "plots/Volcano.plot.one_model.png", plot= volcano_tumor, height = 7, width = 8)

Gene set enrichment analysis

GSEA using fgsea

I have used the approach for fgsea described on this website: https://stephenturner.github.io/deseq-to-fgsea/. I used the hallmark gene set for both analyses (as this set was recommended to start with) Codes for the bottom 4 plots are adapted from the fgsea vignette: https://bioconductor.org/packages/devel/bioc/vignettes/fgsea/inst/doc/fgsea-tutorial.html

1. Load and tidy ranked gene list + load Msigdb gene set

# Extracting two rows from the ranked gene list from DESeq2 (res), as we only need the gene name and test statistic.
res2 <- res %>% 
  as_tibble(rownames = "gene_symbol") %>% 
  dplyr::select(gene_symbol, stat)

#Loading hallmark pathways
pathways.hallmark <- gmtPathways("data/msigdb/msigdb_files/h.all.v2022.1.Hs.symbols.gmt")

2. Run fgsea and tidy results

## Run fgsea
ranks <- deframe(res2)

fgseaRes <- fgsea(pathways=pathways.hallmark, stats=ranks, nPermSimple=10000)

## Tidying: arrangring by descending NES
fgseaResTidy <- fgseaRes %>%
  as_tibble() %>%
  arrange(desc(NES))

3. Results: table and plots

Table of GSEA results

The table below is ordered by descending normalized enrichment scores (NES).

#Making the table
fgseaResTidy %>% 
  dplyr::select(-leadingEdge, -ES) %>% 
 # arrange(padj) %>% #include this to arrange by padj value
  datatable(. , options = list(pageLength = 12, scrollX = T), caption = htmltools::tags$caption(style = 'caption-side: top; text-align: left;', 'Top enriched pathways'))
openxlsx::write.xlsx(fgseaResTidy, 'plots/results_GSEA.xlsx')
Plotted enrichment scores

Plot the normalized enrichment scores. The color of the bar indicates whether or not the pathway was significantly enriched:

ggplot(fgseaResTidy, aes(reorder(pathway, NES), NES)) +
  geom_col(aes(fill=padj<0.05)) +
  coord_flip() +
  labs(x="Pathway", y="Normalized Enrichment Score",
       title="GSEA Hallmark Pathways") + 
  theme_minimal()+
  theme(axis.text.y = element_text(size = 6))

Plot of the significant pathways. Color indicates if the pathways are enriched in the low-risk group (green) or in the high-risk group (pink)

significant_pathways <- fgseaResTidy %>% 
                          filter(padj<0.05) %>% 
                          mutate(pathway = str_replace_all(pathway, "_", " ")) %>% 
                          mutate(pathway = str_remove_all(pathway, "HALLMARK"))

color = ifelse(significant_pathways$NES > 0, "deeppink3", "chartreuse3")
GSEA_plot <- ggplot(significant_pathways, aes(x = reorder(pathway, NES), y = NES)) +
  geom_bar(stat = "identity", fill = color, linewidth = 1)+ # evt: color = "black",
  coord_flip()+
  labs(x="Pathway", y="Normalized Enrichment Score") +
  theme_bw()+
  theme(axis.text = element_text(size = 12, face = "bold"), axis.title = element_text(size = 14, face = "bold"))
GSEA_plot

ggsave(file = "plots/GSEA.plot.one_model.png", plot= GSEA_plot, height = 13, width = 10)
Enrichment plots

From the graph above, we can easily identify the top enriched pathway in high-risk tumors: Hallmark E2F targets. E2F targets include genes/proteins involved in initiation of replication, nucleotide synthesis and DNA synthesis. The enrichment plot for this pathway is plotted below:

plotEnrichment(pathways.hallmark[["HALLMARK_E2F_TARGETS"]],
               ranks) + labs(title="E2F targets pathway")

The top downregulated pathway in high-risk tumors is the myogenesis pathway.

plotEnrichment(pathways.hallmark[["HALLMARK_MYOGENESIS"]],
               ranks) + labs(title="Hallmark myogenesis")

Collapsed pathway plot
collapsedPathways <- collapsePathways(fgseaRes[order(pval)][padj < 0.01], 
                                      pathways.hallmark, ranks)
mainPathways <- fgseaRes[pathway %in% collapsedPathways$mainPathways][
  order(-NES), pathway]
plotGseaTable(pathways.hallmark[mainPathways], ranks, fgseaRes, 
              gseaParam = 0.5,  pathwayLabelStyle = list(size=8))